import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
%matplotlib inline
# Load the vehicle silhouette dataset and take a first look at it.
df=pd.read_csv("vehicle-1.csv")
# First five rows (displayed by the notebook)
df.head()
# Column dtypes and non-null counts
df.info()
This data contains 19 variables: the target variable 'class' is categorical, and the other 18 are independent variables with float and integer datatypes
# Count of target variable; Series.value_counts replaces the deprecated
# top-level pd.value_counts.
df['class'].value_counts()
# Plotting count of target variable. seaborn >= 0.12 requires the data to
# be passed as a keyword argument (x=...), not positionally.
sns.countplot(x=df['class'])
We can see that the dataset is not well balanced:
'car' has the highest number of datapoints, followed by 'van' and 'bus'
# Encode the categorical target 'class' as integer labels so it can be
# consumed by the models below.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])
Labelling the target variable 'class' with integers using a label encoder so that we can use it in our model
#measuring skewness of data
# Positive values indicate a longer right tail, negative a longer left tail.
df.skew()
pr.axis_aspect_ratio, max.length_aspect_ratio and scaled_radius_of_gyration.1 are positively skewed
df.describe()
There are 846 records in total, but the per-variable counts show that some variables have missing values
# Count the missing values per variable, then report the overall total.
missing = pd.DataFrame(df.isna().sum(), columns=["sum of missing values"])
total_missing = missing["sum of missing values"].sum()
print("Total number of missing values: {} ".format(total_missing))
missing
The total number of missing values is 41, as we can see above
# Correlation matrix of all variables, shown as an annotated heatmap.
correlations = df.corr()
plt.figure(figsize=(14, 8))
sns.heatmap(correlations, annot=True, vmin=-1.0, vmax=1.0)
On the above correlation plot,
we can see strong negative relationships between some independent variables,
and strong positive relationships between others (see the annotated cells)
sns.pairplot(df,hue='class')#pairplot to visualize distribution and relationship of variables
We can see that there is a near-perfect relationship between some independent variables, as discussed earlier for the correlation plot
#Density plot for all the variables to visualize the distribution of data
# One density (KDE) curve per class for every independent variable.
# Fixes two issues in the original code: the first feature column
# (index 0) was skipped by the loops, and sns.distplot has been removed
# from modern seaborn -- kdeplot draws the same hist=False curve.
feature_cols = df.columns.drop('class')  # the 18 independent variables
unique_vals = df['class'].unique()
# Slice the dataframe once per class value
targets = [df.loc[df['class'] == val] for val in unique_vals]
# First row of plots: features 0-8
plt.figure(figsize=(25, 3))
for i in range(9):
    plt.subplot(1, 9, i + 1)
    for target in targets:
        sns.kdeplot(target[feature_cols[i]])
plt.show()
# Second row of plots: features 9-17
plt.figure(figsize=(25, 3))
for i in range(9, 18):
    plt.subplot(1, 9, i - 8)
    for target in targets:
        sns.kdeplot(target[feature_cols[i]])
plt.show()
We can see that 'pr.axis_aspect_ratio', 'max.length_aspect_ratio' and 'scaled_radius_of_gyration.1' are positively skewed. Other variables, such as 'scaled_variance', 'scaled_variance.1', 'elongatedness' and 'distance_circularity', have two bumps (are bimodal) and do not closely follow a normal distribution
#Standardizing the data
# Drop the target, then z-score every independent variable so they share a
# common scale. Reusing X.columns replaces the error-prone hard-coded
# column list of the original code (same names, same order, no typos).
X = df.drop('class', axis=1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): NaNs are still present at this point; StandardScaler
# ignores them when fitting (sklearn >= 0.20) and they are imputed below.
XScaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# Removing "scaled_variance.1", "circularity", "scatter_ratio" -- they are
# almost perfectly correlated with other variables (multicollinearity).
XScaled = XScaled.drop(["scatter_ratio", "circularity", "scaled_variance.1"], axis=1)
Transformed the independent variables using the standard scaler to remove the units of measurement from each independent variable
#Imputing Median values to missing values
from sklearn.impute import SimpleImputer
# Replace every NaN with the median of its column. Using XScaled.columns
# keeps the labels in sync instead of repeating the column list by hand;
# `imputer` is a clearer name than the original `X_df` (it is an imputer,
# not a dataframe). The duplicate `import numpy as np` is dropped -- numpy
# is already imported at the top of the file.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
XScaled = pd.DataFrame(imputer.fit_transform(XScaled), columns=XScaled.columns)
Imputed the 41 missing values with the median value of the respective variable
# Correlation heatmap of the independent variables after dropping the
# highly correlated ones and imputing the missing values.
cleaned_cor = XScaled.corr()
plt.figure(figsize=(14, 8))
sns.heatmap(cleaned_cor, annot=True, vmin=-1.0, vmax=1.0)
After removing the highly correlated variables and imputing median values for the missing ones, the correlation plot shows variables that are not highly correlated; proceeding with these variables for model building avoids the problem of multicollinearity
# Box plots of every (scaled) variable to inspect spread and outliers.
plt.figure(figsize=(25, 8))
long_form = XScaled.melt()
sns.boxplot(x="variable", y="value", data=long_form)
We can see the presence of outliers and skewness in 'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio' and 'scaled_radius_of_gyration.1',
and outliers in 'scaled_variance', 'skewness_about' and 'skewness_about.1'
#Quantile based Flooring and Capping
# Outlier treatment: values below the 10th percentile are floored to it and
# values above the 90th percentile are capped to it. Series.clip applies
# both bounds in a single pass, replacing the original pair of np.where
# calls. (The bounds are the 10th/90th percentiles, not quartiles as the
# original q1/q3 names suggested.)
for col in XScaled.columns:
    lo = XScaled[col].quantile(0.10)
    hi = XScaled[col].quantile(0.90)
    XScaled[col] = XScaled[col].clip(lower=lo, upper=hi)
print("Skewed values after Quantile based imputation for outlier")
print(XScaled.skew())
In the above code we detected outliers using the 10% quantile for low values and the 90% quantile for high values.
To treat the outliers, high values are imputed with the 90% quantile value (capping) and low values with the 10% quantile value (flooring).
The skewness printed after this treatment shows that the data is no longer strongly skewed
# Re-draw the box plots after flooring/capping to confirm the outlier treatment.
plt.figure(figsize=(25, 8))
sns.boxplot(x="variable", y="value", data=XScaled.melt())
After flooring and capping using the 10th/90th percentile values, we can see that all values lie within the whiskers of the box plot
#Splitting of Train-Test dataset
# Fixed seed so the split (and the K-fold shuffling below) is reproducible.
seed=100
from sklearn.model_selection import train_test_split
y=df['class']
# 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(XScaled,y, test_size = 0.3, random_state = seed)
Splitting the data with 70% train data and 30% of test data
# Fit a Support Vector Classifier on the original (scaled) variables and
# report its hold-out accuracy.
from sklearn.svm import SVC
svc_clf = SVC()
svc_clf.fit(X_train, y_train)
test_accuracy = np.round(svc_clf.score(X_test, y_test) * 100)
print('Accuracy score with Original variables using train-test split: {} %'.format(test_accuracy))
# K-fold cross validation for Support Vector
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn import metrics

# 10-fold CV on the full (scaled, imputed, capped) feature set.
skf = KFold(n_splits=10, shuffle=True, random_state=seed)
cv_score = []
# enumerate replaces the manual fold counter; the unused train-set
# predictions computed by the original code are dropped.
for fold, (train, test) in enumerate(skf.split(XScaled, y), start=1):
    print('{} of KFold {}'.format(fold, skf.n_splits))
    train_X, test_X = XScaled.loc[train], XScaled.loc[test]
    train_y, test_y = y.loc[train], y.loc[test]
    # SVC Model: fit on this fold's train split, score on its test split
    svc_clf = SVC()
    svc_clf.fit(train_X, train_y)
    score = metrics.accuracy_score(test_y, svc_clf.predict(test_X))
    print("Accuracy for test data: {}".format(score))
    cv_score.append(score)
# Bug fix: the original passed 2 as a second argument to str.format (where
# it was silently ignored), so the mean was rounded to 0 decimals;
# np.round(..., 2) is what was intended.
print("Mean of K-Fold Cross Validation Accuracy score with Original Variables: {} %".format(np.round(np.mean(cv_score)*100, 2)))
# PCA with 7 components fitted on the scaled feature set.
pca7 = PCA(n_components=7)
pca7.fit(XScaled)
explained = pca7.explained_variance_ratio_
print(pca7.components_)
print(explained)
print("Cumulative variance Explained {}".format(np.cumsum(explained)))
We selected 7 PCA components as it explains the variabilty upto 96.1 % as mentioned in above output
# Plot the variance explained by each principal component together with
# the cumulative variance explained.
component_ids = list(range(1, 8))
plt.figure(figsize=(10, 5))
plt.bar(component_ids, pca7.explained_variance_ratio_, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_ids, np.cumsum(pca7.explained_variance_ratio_), where='mid', label='Cumulative explained variance')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
In the above plot,
1st PCA explains 51.7% variation
2nd PCA explains 21.4% variation
3rd PCA explains 7.7% variation
4th PCA explains 6.7% variation
5th PCA explains 3.8% variation
6th PCA explains 2.9% variation
7th PCA explains 1.7% variation
Total variation explained by 7 PCA components are 96.1%
sns.pairplot(pd.DataFrame(Xpca7))
We can see on the diagonal plots that each principal component is approximately normally distributed.
In the bivariate plots, the principal components show no relationship with one another
Xpca7 = pd.DataFrame(pca7.transform(XScaled))
# Train-test split on the principal components (same seed and ratio as the
# earlier split on the original variables).
y = df['class']
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(Xpca7, y, test_size = 0.3, random_state = seed)
# Fit the Support Vector Classifier on the PCA-transformed data
svc_clf.fit(X_train_pca, y_train_pca)
pca_accuracy = np.round(svc_clf.score(X_test_pca, y_test_pca) * 100, 2)
print('Accuracy score with PCA components using train-test split: {} %'.format(pca_accuracy))
#K-fold cross validation for Support Vector with principal components
from sklearn import metrics

# Same 10-fold scheme as before, now on the 7 principal components.
skf = KFold(n_splits=10, shuffle=True, random_state=seed)
cv_score = []
# enumerate replaces the manual fold counter; the unused train-set
# predictions computed by the original code are dropped.
for fold, (train, test) in enumerate(skf.split(Xpca7, y), start=1):
    print('{} of KFold {}'.format(fold, skf.n_splits))
    train_X, test_X = Xpca7.loc[train], Xpca7.loc[test]
    train_y, test_y = y.loc[train], y.loc[test]
    # SVC Model: fit on this fold's train split, score on its test split
    svc_clf_pca = SVC()
    svc_clf_pca.fit(train_X, train_y)
    score = metrics.accuracy_score(test_y, svc_clf_pca.predict(test_X))
    print("Accuracy for test data: {}".format(score))
    cv_score.append(score)
print("Mean of K-Fold Cross Validation Accuracy score with PCA components: {} %".format(np.round(np.mean(cv_score)*100,2)))
From the above analysis we could get the following results,
By applying Support Vector Machine model to original variable,
We selected 7 principal components because together they explain about 96.1% of the variability. We then applied the Support Vector Machine model to those principal components,
This helps to build model with dimensionality reduction using principal components
When We compare the above models,
By accuracy we can select the model with raw data using Kfold cross validation which attains highest test accuracy 96%
If we have concerns with number of independent variables we can select 7 principal component model using Kfold cross validation which attains test accuracy 90.6%
PCA component model's independent variables cannot be explained easily since it is complex combination of independent variables whereas raw data model's independent variables can be easily explained in the model